import os
import re
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from typing import List, Tuple
import threading


class EnglishRemover:
    """Strip English content out of text that is meant to be Chinese.

    Two passes are provided: paragraphs that are almost purely English are
    dropped wholesale (``is_english_paragraph``), and inside the remaining
    paragraphs long English sentences/fragments are blanked out
    (``remove_long_english_sentences``). ``process_file`` applies both
    passes to a UTF-8 text file.
    """

    # A paragraph is dropped when it has MORE than this many ASCII letters ...
    _PARAGRAPH_ENGLISH_MIN = 25
    # ... and FEWER than this many CJK ideographs.
    _PARAGRAPH_CHINESE_MAX = 4
    # A matched fragment is removed only when it has MORE than this many
    # ASCII letters.
    _SENTENCE_ENGLISH_MIN = 15

    # Patterns are compiled once here instead of on every call.
    _ENGLISH_LETTER_RE = re.compile(r'[a-zA-Z]')
    _CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')
    _MARKDOWN_EMPHASIS_RE = re.compile(r'\*\*|\*')
    # A run of ideographs plus one optional trailing punctuation/quote mark.
    _CHINESE_SEGMENT_RE = re.compile(r'[\u4e00-\u9fff]+[，。！？：；"\']?')
    _ENGLISH_FRAGMENT_RES = (
        # A complete English sentence ending in . ? or !
        re.compile(r'[A-Za-z][A-Za-z0-9\s\,\;\:\"\'\-\(\)\[\]\{\}]*?[\.\?\!]'),
        # A stand-alone English fragment that follows the start of the text
        # or a whitespace character.
        re.compile(r'(?:^|\s|\n)[A-Za-z][A-Za-z0-9\s\,\;\:\"\'\-\(\)\[\]\{\}]{8,}'),
        # Quoted English text.
        re.compile(r'[\"\'][A-Za-z][\w\s\,\;\:\.\?\!\-\(\)\[\]\{\}]*?[\"\']'),
    )

    @staticmethod
    def is_english_paragraph(text: str) -> bool:
        """Return True when *text* should be removed as an English paragraph.

        The paragraph qualifies when it contains more than 25 ASCII letters
        and fewer than 4 characters in the U+4E00..U+9FFF range.
        """
        english_chars = len(EnglishRemover._ENGLISH_LETTER_RE.findall(text))
        chinese_chars = len(EnglishRemover._CHINESE_CHAR_RE.findall(text))
        return (english_chars > EnglishRemover._PARAGRAPH_ENGLISH_MIN
                and chinese_chars < EnglishRemover._PARAGRAPH_CHINESE_MAX)

    @staticmethod
    def _merge_spans(spans, protected):
        """Merge overlapping/adjacent ``(start, end)`` spans.

        Two spans separated by exactly one character are joined only when
        that character is not protected; joining across a protected
        character would put Chinese text inside a deletion span.

        Args:
            spans: iterable of ``(start, end)`` half-open index pairs.
            protected: list of bools, True where the character must be kept.

        Returns:
            The merged spans, sorted by start index.
        """
        merged = []
        for start, end in sorted(spans):
            if merged:
                prev_start, prev_end = merged[-1]
                overlapping = start <= prev_end
                # prev_end < start <= len(protected), so the index is valid.
                bridgeable = start == prev_end + 1 and not protected[prev_end]
                if overlapping or bridgeable:
                    merged[-1] = (prev_start, max(prev_end, end))
                    continue
            merged.append((start, end))
        return merged

    @staticmethod
    def remove_long_english_sentences(paragraph: str) -> Tuple[str, int]:
        """Blank out long English sentences/fragments inside a paragraph.

        Markdown ``*``/``**`` markers are stripped first (the returned text
        no longer contains them). Every fragment matched by one of the
        English patterns that has more than 15 ASCII letters and does not
        overlap a Chinese segment is replaced by spaces, after which
        ``_clean_remnants`` tidies the result.

        Args:
            paragraph: the paragraph text.

        Returns:
            Tuple[str, int]: the processed paragraph and the number of
            merged regions that were actually removed. A blank paragraph is
            returned unchanged with a count of 0.
        """
        if not paragraph.strip():
            return paragraph, 0

        clean_paragraph = EnglishRemover._MARKDOWN_EMPHASIS_RE.sub('', paragraph)

        # Mark every character that belongs to a Chinese segment; those
        # positions must never be deleted.
        protected = [False] * len(clean_paragraph)
        for match in EnglishRemover._CHINESE_SEGMENT_RE.finditer(clean_paragraph):
            start, end = match.span()
            protected[start:end] = [True] * (end - start)

        # Collect every sufficiently long English match that is free of
        # protected characters.
        candidates = []
        for pattern in EnglishRemover._ENGLISH_FRAGMENT_RES:
            for match in pattern.finditer(clean_paragraph):
                start, end = match.span()
                letters = len(
                    EnglishRemover._ENGLISH_LETTER_RE.findall(match.group())
                )
                if (letters > EnglishRemover._SENTENCE_ENGLISH_MIN
                        and not any(protected[start:end])):
                    candidates.append((start, end))

        # Merged spans never contain protected characters, so each one can be
        # blanked directly and the count reflects what was really removed.
        merged = EnglishRemover._merge_spans(candidates, protected)
        chars = list(clean_paragraph)
        for start, end in merged:
            chars[start:end] = [' '] * (end - start)

        result_paragraph = EnglishRemover._clean_remnants(''.join(chars))
        return result_paragraph, len(merged)

    @staticmethod
    def _clean_remnants(text: str) -> str:
        """Tidy leftover punctuation, stray English words and whitespace."""
        # Drop ASCII punctuation that stands alone between whitespace, or at
        # the very start / very end of the text.
        text = re.sub(r'(?<=\s)[,\.;:!\?]+(?=\s)', '', text)
        text = re.sub(r'^\s*[,\.;:!\?]+\s*', '', text)
        text = re.sub(r'\s*[,\.;:!\?]+\s*$', '', text)

        # Drop isolated English words (1-10 letters) surrounded by whitespace.
        text = re.sub(r'(?<=\s)[a-zA-Z]{1,10}(?=\s)', '', text)

        # Drop an English word left at the very start or very end of the text.
        text = re.sub(r'^\s*[a-zA-Z]+[\s\.!\?,;:]*', '', text)
        text = re.sub(r'[\s\.!\?,;:]*[a-zA-Z]+\s*$', '', text)

        # Collapse whitespace runs to one space. `\s` also covers newlines,
        # so runs that contain line breaks collapse to a single space too.
        text = re.sub(r'\s{2,}', ' ', text)

        # Trim leading/trailing whitespace on every line.
        text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)

        # Normalise any blank-line run to exactly one blank line.
        text = re.sub(r'\n\s*\n', '\n\n', text)

        return text

    @staticmethod
    def process_file(file_path: str, overwrite: bool = True) -> Tuple[str, int]:
        """Remove English paragraphs and long English sentences from a file.

        The file is read as UTF-8, split into paragraphs on blank lines,
        filtered, re-joined with one blank line between paragraphs and
        written back as UTF-8.

        Args:
            file_path: path of the text file to process.
            overwrite: when True the input file is rewritten in place;
                otherwise the result is written next to it with
                ``-去英文段`` appended to the base name.

        Returns:
            Tuple[str, int]: the output path and the number of removed
            paragraphs plus removed sentence regions. If anything goes
            wrong the error is printed and ``(file_path, 0)`` is returned;
            no exception propagates to the caller.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            filtered_paragraphs = []
            total_removed = 0

            for para in re.split(r'\n\s*\n', content):
                if EnglishRemover.is_english_paragraph(para):
                    # Whole paragraph is English: drop it entirely.
                    total_removed += 1
                    continue
                processed_para, removed_sentences = (
                    EnglishRemover.remove_long_english_sentences(para)
                )
                filtered_paragraphs.append(processed_para)
                total_removed += removed_sentences

            new_content = '\n\n'.join(filtered_paragraphs)

            if overwrite:
                output_path = file_path
            else:
                file_name, file_ext = os.path.splitext(file_path)
                output_path = f"{file_name}-去英文段{file_ext}"

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(new_content)

            return output_path, total_removed

        except Exception as e:
            # Deliberate best-effort: report the failure and let the caller
            # continue with the next file.
            print(f"处理文件 {file_path} 时出错: {str(e)}")
            return file_path, 0


class TextProcessorApp:
    """Tkinter front end for batch-removing English content from .txt files.

    The window holds a list of queued files, an optional drag-and-drop
    target, an "overwrite original" option and a button that runs
    ``EnglishRemover.process_file`` over every queued file on a background
    thread.
    """

    def __init__(self, root):
        """Build the UI inside *root*.

        Args:
            root: the Tk root window. When it carries a truthy ``tk_dnd``
                attribute, drag-and-drop is registered on the drop label.
        """
        self.root = root
        self.root.title("文本英文段落批量去除工具")
        self.root.geometry("800x600")
        self.root.minsize(700, 500)

        # Normalised paths of the queued files, kept index-aligned with the
        # rows of ``self.file_listbox``.
        self.file_paths = []

        self._create_widgets()
        self._setup_drag_drop()

    def _create_widgets(self):
        """Create and lay out all widgets."""
        main_frame = ttk.Frame(self.root, padding="10")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # Top: file list with a vertical scrollbar.
        list_frame = ttk.LabelFrame(main_frame, text="文件列表", padding="5")
        list_frame.pack(fill=tk.BOTH, expand=True, pady=(0, 10))

        scrollbar = ttk.Scrollbar(list_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        self.file_listbox = tk.Listbox(list_frame, selectmode=tk.EXTENDED,
                                       yscrollcommand=scrollbar.set)
        self.file_listbox.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.file_listbox.yview)

        # Buttons that add to / remove from the list.
        file_btn_frame = ttk.Frame(list_frame)
        file_btn_frame.pack(fill=tk.X, pady=(5, 0))

        add_file_btn = ttk.Button(file_btn_frame, text="添加文件",
                                  command=self._add_files)
        add_file_btn.pack(side=tk.LEFT, padx=(0, 5))

        add_folder_btn = ttk.Button(file_btn_frame, text="添加文件夹",
                                    command=self._add_folder)
        add_folder_btn.pack(side=tk.LEFT, padx=(0, 5))

        remove_selected_btn = ttk.Button(file_btn_frame, text="移除选中",
                                         command=self._remove_selected)
        remove_selected_btn.pack(side=tk.LEFT)

        # Middle: drag-and-drop target.
        drop_frame = ttk.LabelFrame(main_frame, text="拖放文件或文件夹到此处", padding="10")
        drop_frame.pack(fill=tk.BOTH, expand=False, pady=(0, 10), ipady=30)

        self.drop_label = ttk.Label(drop_frame, text="将文件或文件夹拖放到此区域")
        self.drop_label.pack(fill=tk.BOTH, expand=True)

        # Options: overwrite the original file (on by default).
        options_frame = ttk.Frame(main_frame)
        options_frame.pack(fill=tk.X, pady=(0, 10))

        self.overwrite_var = tk.BooleanVar(value=True)
        overwrite_check = ttk.Checkbutton(options_frame, text="覆盖原txt文件",
                                          variable=self.overwrite_var)
        overwrite_check.pack(side=tk.LEFT)

        # Bottom: clear-list and process buttons.
        buttons_frame = ttk.Frame(main_frame)
        buttons_frame.pack(fill=tk.X)

        clear_btn = ttk.Button(buttons_frame, text="清除列表",
                               command=self._clear_list)
        clear_btn.pack(side=tk.RIGHT, padx=5)

        self.process_btn = ttk.Button(buttons_frame, text="去除英文段落",
                                      command=self._process_files)
        self.process_btn.pack(fill=tk.X, expand=True, pady=10, padx=5)

    def _add_files(self):
        """Callback of the "add files" button: pick .txt files via a dialog."""
        files = filedialog.askopenfilenames(
            title="选择TXT文件",
            filetypes=[("文本文件", "*.txt"), ("所有文件", "*.*")]
        )

        for file_path in files:
            # The dialog also offers "all files"; only .txt files are queued.
            if file_path.lower().endswith('.txt'):
                self._add_path(file_path)

    def _add_folder(self):
        """Callback of the "add folder" button: queue a folder's .txt files."""
        folder = filedialog.askdirectory(title="选择文件夹")
        if folder:
            self._add_path(folder)

    def _remove_selected(self):
        """Remove the rows currently selected in the list box."""
        selected_indices = self.file_listbox.curselection()
        if not selected_indices:
            return

        # Delete from the highest index down so earlier indices stay valid.
        for idx in sorted(selected_indices, reverse=True):
            self.file_paths.pop(idx)
            self.file_listbox.delete(idx)

    def _setup_drag_drop(self):
        """Register the drop label as a file drop target when DnD is available.

        The root's ``tk_dnd`` flag is tested by value: ``main`` sets it to
        False when tkinterdnd2 is missing, so merely checking that the
        attribute exists would wrongly attempt registration on a plain Tk
        label.
        """
        unavailable_text = "拖放功能不可用，请使用'添加文件'按钮"

        if not getattr(self.root, 'tk_dnd', False):
            self.drop_label.configure(text=unavailable_text)
            return

        try:
            self.drop_label.drop_target_register("DND_Files")
            self.drop_label.dnd_bind('<<Drop>>', self._on_drop)
        except Exception as e:
            print(f"设置拖放功能时出错: {str(e)}")
            # Tell the user to fall back to the buttons.
            self.drop_label.configure(text=unavailable_text)

    def _split_drop_data(self, data):
        """Split the payload of a drop event into individual paths.

        The payload is parsed as a Tcl list, which handles a mix of
        brace-wrapped paths (containing spaces) and plain paths. If the
        payload is not a well-formed Tcl list, a simpler brace/whitespace
        split is used instead.
        """
        try:
            return list(self.root.tk.splitlist(data))
        except tk.TclError:
            if '{' in data:
                return [p.strip('{}') for p in data.split('} {')]
            return data.split()

    def _on_drop(self, event):
        """Handle a ``<<Drop>>`` event by queueing every dropped path."""
        try:
            for path in self._split_drop_data(event.data):
                # Strip surrounding quotes, if any.
                self._add_path(path.strip('"\''))
        except Exception as e:
            print(f"拖放处理出错: {str(e)}")
            messagebox.showerror("拖放错误", f"处理拖放时出错: {str(e)}\n请使用'添加文件'按钮。")

    def _add_path(self, path):
        """Queue *path*: a .txt file, or every .txt file under a directory.

        Paths that are neither an existing directory nor an existing .txt
        file are ignored. Errors are printed rather than raised.
        """
        try:
            # Remove stray braces and quotes around the path.
            path = path.strip('{}\'\"')

            if os.path.isdir(path):
                # Walk the directory tree and queue every .txt file found.
                for root, _, files in os.walk(path):
                    for file in files:
                        if file.lower().endswith('.txt'):
                            self._add_file_to_list(os.path.join(root, file))

            elif os.path.isfile(path) and path.lower().endswith('.txt'):
                self._add_file_to_list(path)
        except Exception as e:
            print(f"添加路径 {path} 时出错: {str(e)}")

    def _add_file_to_list(self, file_path):
        """Append *file_path* to the queue unless it is already present.

        Paths are compared after ``os.path.normpath`` so differently
        formatted spellings of the same path are not queued twice.
        """
        norm_path = os.path.normpath(file_path)

        if norm_path not in [os.path.normpath(p) for p in self.file_paths]:
            self.file_paths.append(norm_path)
            self.file_listbox.insert(tk.END, norm_path)
            print(f"已添加文件: {norm_path}")  # debug output

    def _clear_list(self):
        """Empty the queue and the list box."""
        self.file_listbox.delete(0, tk.END)
        self.file_paths = []

    def _process_files(self):
        """Callback of the process button: start the background worker."""
        if not self.file_paths:
            messagebox.showinfo("提示", "请先添加要处理的文件")
            return

        # Disable the button so the job cannot be started twice.
        self.process_btn.config(state=tk.DISABLED)

        # Snapshot UI state here, on the main thread: the worker must not
        # read Tk variables, and the list may be edited while it runs.
        file_paths = list(self.file_paths)
        overwrite = self.overwrite_var.get()

        threading.Thread(
            target=self._process_files_thread,
            args=(file_paths, overwrite),
            daemon=True,
        ).start()

    def _process_files_thread(self, file_paths=None, overwrite=None):
        """Worker body: process every file and report back via ``root.after``.

        Args:
            file_paths: the files to process. When omitted, the current
                queue is copied.
            overwrite: whether to overwrite the original files. When
                omitted, the checkbox value is read.
        """
        if file_paths is None:
            file_paths = list(self.file_paths)
        if overwrite is None:
            overwrite = self.overwrite_var.get()

        total = len(file_paths)
        total_removed = 0
        processed_files = 0

        try:
            for file_path in file_paths:
                _, removed_count = EnglishRemover.process_file(file_path, overwrite)
                total_removed += removed_count
                processed_files += 1

                # Progress update; values are bound as defaults so each
                # callback shows its own counter.
                self.root.after(0, lambda idx=processed_files, total=total:
                                self.process_btn.config(
                                    text=f"处理中... ({idx}/{total})"
                                ))

            # Report the summary once everything has been processed.
            self.root.after(0, lambda: messagebox.showinfo(
                "处理完成",
                f"成功处理 {processed_files} 个文件，共删除 {total_removed} 个英文段落"
            ))

        except Exception as e:
            # Build the message now: `e` is unbound when the except block
            # ends, so a callback that references it later would fail.
            error_text = f"处理过程中发生错误: {str(e)}"
            self.root.after(0, lambda: messagebox.showerror("错误", error_text))

        finally:
            # Always re-enable the button and restore its caption.
            self.root.after(0, lambda: self.process_btn.config(
                state=tk.NORMAL,
                text="去除英文段落"
            ))


def main():
    """Create the root window, warn if drag-and-drop is missing, run the app.

    tkinterdnd2 is tried first; when it cannot be imported a plain Tk root
    is used instead and the user is told how to install the library. The
    root's ``tk_dnd`` attribute records which of the two happened.
    """
    dnd_available = True
    try:
        # Preferred: a DnD-capable root from tkinterdnd2.
        from tkinterdnd2 import TkinterDnD
        root = TkinterDnD.Tk()
        print("成功加载tkinterdnd2")
    except ImportError:
        # Library not installed: fall back to a standard Tk root.
        dnd_available = False
        root = tk.Tk()

    # Flag read by the application to decide whether DnD can be set up.
    root.tk_dnd = dnd_available

    if not dnd_available:
        warning_text = (
            "未检测到tkinterdnd2库，拖放功能将不可用。\n"
            "请安装tkinterdnd2: pip install tkinterdnd2"
        )
        messagebox.showwarning("功能受限", warning_text)

    app = TextProcessorApp(root)
    root.mainloop()


# Start the GUI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
